Loading datasets and shuffling¶
In [ ]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import joblib
import cv2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.utils import shuffle
# Load the EMNIST-letters train/test CSVs, merge them, and shuffle the rows.
train_df = pd.read_csv("emnist-letters-train.csv")
test_df = pd.read_csv("emnist-letters-test.csv")

# Rename columns to 0..784: column 0 is the label, columns 1..784 are pixels.
train_df.columns = list(range(785))
test_df.columns = list(range(785))

print(train_df.shape)
print(train_df.info())
print(test_df.shape)
print(test_df.info())

# Stack train and test vertically into one dataset before re-splitting later.
dataset = pd.concat([train_df, test_df], axis=0)
print(dataset.shape)
print(dataset.info())

# shuffle() already returns a DataFrame, so the extra pd.DataFrame wrapper
# from the original was redundant.
# NOTE(review): no random_state is given, so this shuffle is non-reproducible
# across runs — confirm that is intended.
dataset = shuffle(dataset)
print(dataset.shape)
(88799, 785) <class 'pandas.core.frame.DataFrame'> RangeIndex: 88799 entries, 0 to 88798 Columns: 785 entries, 0 to 784 dtypes: int64(785) memory usage: 531.8 MB None (14799, 785) <class 'pandas.core.frame.DataFrame'> RangeIndex: 14799 entries, 0 to 14798 Columns: 785 entries, 0 to 784 dtypes: int64(785) memory usage: 88.6 MB None (103598, 785) <class 'pandas.core.frame.DataFrame'> Index: 103598 entries, 0 to 14798 Columns: 785 entries, 0 to 784 dtypes: int64(785) memory usage: 621.2 MB None (103598, 785)
Splitting the dataset into features and labels¶
In [ ]:
# Separate the label column (index 0) from the 784 pixel columns.
labels = dataset.iloc[:, 0]
features = dataset.iloc[:, 1:]
print(labels.shape)
print(features.shape)
(103598,) (103598, 784)
Building label map¶
In [ ]:
# Map numeric class labels 1..26 to lowercase letters 'a'..'z'.
mp = {label: chr(ord('a') + label - 1) for label in range(1, 27)}
print(mp)
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}
Preprocessing images so they all have the same orientation¶
In [ ]:
# Convert the pixel DataFrame to a float array scaled to [0, 1], then
# reshape to (n, 28, 28, 1) images and reorient them (EMNIST images are
# stored rotated/mirrored relative to the natural reading orientation).
features = np.array(features)
print(features.shape)
features = features / 255.0
images_4d = features.reshape(-1, 28, 28, 1)  # one grayscale 28x28 image per row
features_image = np.flip(np.rot90(images_4d, axes=(1, 2)), axis=1)
(103598, 784)
Drawing images function¶
In [ ]:
def draw_images(images, row_count, column_count):
    """Show a row_count x column_count grid of grayscale images."""
    fig, axes = plt.subplots(row_count, column_count, figsize=(10, 10))
    # axes.ravel() walks the grid in row-major order, so cell k shows images[k].
    for cell_index, ax in enumerate(axes.ravel()):
        ax.imshow(images[cell_index], cmap="gray")
        ax.axis('off')
    plt.show()
# Sample exactly as many images as the 15x15 grid holds, without replacement
# so no image is repeated. (The original sampled 300 indices with possible
# duplicates but the grid only displayed the first 225 of them.)
grid_rows, grid_cols = 15, 15
indices = np.random.choice(features_image.shape[0], grid_rows * grid_cols, replace=False)
draw_images(features_image[indices].squeeze(), grid_rows, grid_cols)
Train/test split and fitting the model using parameters from the merged-dataset grid search¶
In [ ]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split; train_size=0.8 already implies a 0.2 test share,
# so the redundant test_size argument from the original is dropped.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, train_size=0.8, stratify=labels, random_state=42)

# Hyperparameters taken from an earlier grid search on the merged dataset.
classifier = DecisionTreeClassifier(
    ccp_alpha=0.0, class_weight=None, criterion='entropy',
    max_depth=20, max_features=None, max_leaf_nodes=None,
    min_impurity_decrease=0.0, min_samples_leaf=4, min_samples_split=10,
    min_weight_fraction_leaf=0.0, random_state=42, splitter='best')
classifier.fit(X_train, y_train)

# Persist the fitted model. NOTE(review): filename keeps the original
# "Decison" typo so any existing loaders of this artifact keep working.
joblib.dump(classifier, 'best_DecisonTree_model_merged.joblib')

# Evaluate on the held-out split.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
confusionMatrix = confusion_matrix(y_test, y_pred)
print("classifier.score(X_test, y_test): ", classifier.score(X_test, y_test))
print(f'Accuracy of the Best Model: {accuracy}')
print("Classification Report: \n", classification_report(y_test, y_pred))
print("Confusion Matrix: \n", confusionMatrix)
classifier.score(X_test, y_test): 0.7071911196911197
Accuracy of the Best Model: 0.7071911196911197
Classification Report:
precision recall f1-score support
1 0.57 0.60 0.58 839
2 0.67 0.70 0.68 839
3 0.79 0.83 0.81 844
4 0.66 0.68 0.67 840
5 0.75 0.76 0.76 848
6 0.66 0.70 0.68 839
7 0.50 0.49 0.50 837
8 0.68 0.71 0.69 845
9 0.60 0.63 0.61 846
10 0.76 0.78 0.77 840
11 0.69 0.69 0.69 848
12 0.63 0.62 0.62 843
13 0.81 0.85 0.83 840
14 0.69 0.72 0.70 833
15 0.83 0.82 0.82 842
16 0.76 0.79 0.77 846
17 0.56 0.48 0.51 847
18 0.73 0.69 0.71 844
19 0.81 0.79 0.80 758
20 0.65 0.66 0.66 687
21 0.75 0.75 0.75 684
22 0.78 0.79 0.79 684
23 0.82 0.74 0.78 684
24 0.75 0.69 0.72 687
25 0.75 0.71 0.73 691
26 0.82 0.75 0.78 685
accuracy 0.71 20720
macro avg 0.71 0.71 0.71 20720
weighted avg 0.71 0.71 0.71 20720
Confusion Matrix:
[[507 18 14 23 6 11 30 33 1 0 13 2 18 42 17 7 29 20
7 3 18 2 11 4 0 3]
[ 17 587 11 38 20 11 30 26 5 4 11 5 1 7 8 7 14 5
14 3 3 0 0 1 0 11]
[ 11 9 702 1 32 6 10 1 1 2 11 7 3 2 8 7 9 4
1 8 2 1 0 0 2 4]
[ 18 49 2 575 1 11 14 18 8 19 10 3 1 8 35 4 7 3
8 4 14 4 3 5 5 11]
[ 14 15 52 4 646 14 12 3 2 0 12 2 4 4 2 8 5 15
6 17 1 1 1 1 0 7]
[ 12 5 5 4 18 584 16 4 9 4 10 6 7 7 1 60 11 15
10 28 0 3 3 9 4 4]
[ 34 31 11 12 22 25 411 4 3 24 6 5 8 5 6 11 132 5
36 10 7 4 4 6 9 6]
[ 31 23 0 11 4 10 3 601 6 3 25 17 23 27 3 1 5 7
2 8 14 2 3 14 1 1]
[ 2 3 0 14 2 13 7 4 534 29 4 185 0 0 1 6 6 4
3 7 0 1 0 2 8 11]
[ 3 12 4 27 2 4 8 2 25 657 1 13 1 2 3 3 5 0
14 24 4 3 4 4 9 6]
[ 6 6 13 14 12 21 4 31 1 0 588 3 9 18 0 4 2 36
1 11 15 6 1 36 3 7]
[ 2 5 6 8 0 7 8 15 229 5 7 522 0 1 0 0 2 1
1 9 1 1 0 1 3 9]
[ 15 9 0 1 3 3 3 19 1 2 8 0 711 31 2 2 1 0
0 2 6 3 15 3 0 0]
[ 30 3 2 8 2 2 7 42 0 2 13 1 36 600 8 2 6 10
1 4 5 6 25 8 7 3]
[ 20 13 11 42 8 3 15 1 0 5 0 1 5 6 687 1 11 1
5 1 1 0 4 0 0 1]
[ 5 1 0 8 4 60 9 2 7 2 3 7 0 2 4 665 18 10
0 18 1 7 1 0 9 3]
[ 63 31 14 13 15 18 129 7 6 7 6 1 4 11 22 27 403 11
12 15 10 5 3 4 8 2]
[ 34 3 17 1 23 17 6 8 3 2 31 4 5 10 1 22 10 582
2 24 1 14 3 7 11 3]
[ 7 14 2 6 3 8 33 1 5 40 4 0 1 2 4 0 8 3
600 3 0 1 0 2 9 2]
[ 4 9 9 10 15 30 8 6 18 19 7 12 8 1 0 19 6 20
3 453 0 1 4 1 17 7]
[ 19 5 0 14 0 0 9 9 0 11 12 2 4 20 11 1 6 4
1 1 514 33 4 2 2 0]
[ 3 4 2 8 3 2 3 2 2 4 2 3 2 11 3 2 2 8
0 5 36 538 10 4 25 0]
[ 10 3 6 10 1 4 6 14 1 4 4 1 14 40 3 1 4 2
4 3 23 7 509 2 5 3]
[ 13 0 0 11 8 9 8 29 6 2 48 5 4 4 0 0 3 12
5 4 1 8 1 475 26 5]
[ 3 5 0 7 0 5 16 4 6 12 4 9 3 5 0 9 14 11
4 16 5 33 5 24 491 0]
[ 12 13 8 5 13 6 11 2 12 9 13 12 1 4 0 2 7 5
4 12 1 2 3 15 2 511]]
Drawing ROC Curve¶
In [ ]:
from sklearn.preprocessing import LabelBinarizer

# Per-class probability scores; columns are ordered by classifier.classes_.
y_score = classifier.predict_proba(X_test)

# --- debug inspection of label/score distributions ---
print("Train labels: ")
print(pd.DataFrame(y_train).value_counts())
print("----------------------------------------------------------------------------")
print(pd.DataFrame(y_score).value_counts())
print("y_score_proba: shape ", y_score.shape)
print("----------------------------------------------------------------------------")
print("Pred labels: ")
print(pd.DataFrame(y_pred).value_counts())
print("----------------------------------------------------------------------------")
print("test_labels: ")
print(pd.DataFrame(y_test).value_counts())
print("----------------------------------------------------------------------------")
print("train_labels: ")
print(pd.DataFrame(y_train).value_counts())
print("----------------------------------------------------------------------------")
print("train_labels shape: ")  # fixed label: this line prints the shape, not the labels
print(np.array(y_train.shape))

# Binarize y_test against the classifier's own class ordering so y_bin
# columns are guaranteed to line up with y_score columns. (The original
# fit the binarizer on y_test, which only aligns because y_test happens
# to contain every class; it would silently misalign otherwise.)
label_binarizer = LabelBinarizer()
label_binarizer.fit(classifier.classes_)
y_bin = label_binarizer.transform(y_test)
print(y_score.shape, y_bin.shape, y_bin[0], y_test.shape)

# One-vs-rest ROC curve and AUC per class (class labels run 1..26).
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = 26
for i in range(1, n_classes + 1):
    fpr[i], tpr[i], _ = roc_curve(y_bin[:, i - 1], y_score[:, i - 1])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curves
plt.figure(figsize=(10, 8))
for i in range(1, n_classes + 1):
    plt.plot(fpr[i], tpr[i], label=f'Class {mp[i]} (AUC = {roc_auc[i]:.2f})')
plt.plot([0, 1], [0, 1], 'k--', lw=2)  # Diagonal line for random classifier
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve for Each Class (One-vs-All)')
plt.legend(loc='lower right')
plt.show()
Train labels:
11 3390
5 3389
17 3388
16 3384
9 3382
8 3379
18 3375
3 3375
12 3372
15 3366
10 3362
13 3362
4 3358
2 3357
1 3356
6 3355
7 3348
14 3332
19 3034
25 2762
24 2750
20 2749
26 2742
22 2738
23 2738
21 2735
Name: count, dtype: int64
----------------------------------------------------------------------------
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
0.0 0.000000 0.0 0.0 0.0 0.000000 0.0 0.0 0.00 0.000000 0.0 0.0 1.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.000 0.000000 0.000000 713
1.0 0.0 0.0 0.000000 0.0 0.0 0.00 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.000 0.000000 0.000000 712
0.0 0.0 0.0 0.000000 0.0 0.0 0.00 0.000000 0.0 0.0 0.0 0.0 0.0 1.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.000 0.000000 0.000000 684
1.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.000 0.000000 0.000000 660
1.0 0.00 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.000 0.000000 0.000000 649
...
0.166667 0.0 0.0 0.00 0.166667 0.0 0.0 0.0 0.0 0.0 0.0 0.333333 0.0 0.0 0.000000 0.0 0.000000 0.166667 0.000 0.000000 0.166667 1
0.166667 0.0 0.0 0.0 0.000000 0.0 0.0 0.00 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.166667 0.0 0.0 0.333333 0.0 0.000000 0.000000 0.000 0.333333 0.000000 1
0.000000 0.0 0.0 0.0 0.000000 0.0 0.0 0.25 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.125 0.000000 0.625000 1
0.000 0.750000 0.000000 1
0.166667 0.0 0.0 0.0 0.000000 0.0 0.0 0.00 0.166667 0.0 0.0 0.0 0.0 0.0 0.0 0.500000 0.0 0.0 0.000000 0.0 0.166667 0.000000 0.000 0.000000 0.000000 1
Name: count, Length: 2374, dtype: int64
y_score_proba: shape (20720, 26)
----------------------------------------------------------------------------
Pred labels:
1 895
3 891
9 891
8 888
6 884
2 876
4 875
13 873
16 871
14 870
10 868
5 863
11 853
15 829
12 828
7 816
18 794
19 744
17 726
20 693
22 686
21 683
25 656
24 630
26 620
23 617
Name: count, dtype: int64
----------------------------------------------------------------------------
test_labels:
5 848
11 848
17 847
16 846
9 846
8 845
18 844
3 844
12 843
15 842
10 840
4 840
13 840
6 839
2 839
1 839
7 837
14 833
19 758
25 691
20 687
24 687
26 685
21 684
22 684
23 684
Name: count, dtype: int64
----------------------------------------------------------------------------
train_labels:
11 3390
5 3389
17 3388
16 3384
9 3382
8 3379
18 3375
3 3375
12 3372
15 3366
10 3362
13 3362
4 3358
2 3357
1 3356
6 3355
7 3348
14 3332
19 3034
25 2762
24 2750
20 2749
26 2742
22 2738
23 2738
21 2735
Name: count, dtype: int64
----------------------------------------------------------------------------
train_labels:
[82878]
(20720, 26) (20720, 26) [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] (20720,)
Confusion Matrix¶
In [ ]:
import seaborn as sns

# Letter names for the axis ticks, in class order (1 -> 'a', ..., 26 -> 'z').
ls = [mp[i + 1] for i in range(n_classes)]

plt.figure(figsize=(12, 10))
sns.heatmap(confusionMatrix, annot=True, fmt='d', cmap='Blues', xticklabels=ls, yticklabels=ls)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
Plotting the Decision Tree¶
In [ ]:
from sklearn.tree import plot_tree

# Class names 'a'..'z' for the tree's node annotations.
ls = [mp[i + 1] for i in range(26)]

plt.figure(figsize=(64, 256))
plot_tree(classifier, filled=True, class_names=ls)
plt.show()